############################
#
# Replication FILE
# Journal of Substance
# Abuse Treatment Archive
# B. W. Campbell
# Last Updated:  11/05/2018
#
############################

#####################
# Write/prepare data
#####################
if(write.data == TRUE){

  options(stringsAsFactors = FALSE)
  
  ############
  #
  # Load data
  #
  ############
  
  dv <- read.csv("WCAttrib.csv")
  
  # subset by gender as our edges are only for males
  dv <- subset(dv, gender == "Male")
  
  # Since we are also interested in predicting recidivism, we should construct a new variable
  # A value of 1 is if the member at any point recidivated (regardless of if 1 or 2)
  dv$recidivist <- rep(0, times = nrow(dv))
  dv$recidivist[dv$recidflag1 > 0] <- 1
  dv$recidivist[dv$recidflag == 0] <- 0
  table(dv$recidivist)
  
  ################
  #
  # Load Networks
  # 
  ################
  
  # Edge list for affirmations
  ledge <- read.table("WCM-ledge")
  colnames(ledge) <- c("eventDate", "sender", "receiver", "countEvents")
  length(unique(ledge$eventDate)) # [1] 2043
  length(unique(ledge$sender)) # [1] 1661
  length(unique(ledge$receiver)) # [1] 1716
  
  # Edge list for corrections
  sedge <- read.table("WCM-sedge")
  colnames(sedge) <- c("eventDate", "sender", "receiver", "countEvents")
  length(unique(sedge$eventDate)) # [1] 2401
  length(unique(sedge$sender)) # [1] 1658
  length(unique(sedge$receiver)) # [1] 1710
  
  ################
  #
  # Clean data
  # 
  ################
  
  dv$enterClean <- as.Date(as.character(dv$enter),format = '%m/%d/%Y')
  dv$exitClean <- as.Date(as.character(dv$exit),format = '%m/%d/%Y')
  
  library(stringr)
  dayEnter <- str_sub(dv$enterClean, start = -2)
  monthEnter <- str_sub(dv$enterClean, start = -5, end = -4)
  yearEnter <- str_sub(dv$enterClean, start = 1, end = 4)
  
  dayExit <- str_sub(dv$exitClean, start = -2)
  monthExit <- str_sub(dv$exitClean, start = -5, end = -4)
  yearExit <- str_sub(dv$exitClean, start = 1, end = 4)
  
  enterClean <- paste(yearEnter, monthEnter, dayEnter, sep = '') 
  exitClean <- paste(yearExit, monthExit, dayExit, sep = '')
  dv$enterClean <- enterClean
  dv$exitClean <- exitClean
  head(dv)
  dv$row <- row.names(dv)
  
  # Clean network dates
  sedge$eventDateClean <- as.Date(as.character(sedge$eventDate),format = '%m/%d/%Y')

  day <- str_sub(sedge$eventDateClean, start = -2)
  month  <- str_sub(sedge$eventDateClean, start = -5, end = -4)
  year <- str_sub(sedge$eventDateClean, start = 1, end = 4)
  
  sedge$eventDateClean <- paste(year, month, day, sep = '') 
  
  ledge$eventDateClean <- as.Date(as.character(ledge$eventDate),format = '%m/%d/%Y')
  
  day <- str_sub(ledge$eventDateClean, start = -2)
  month  <- str_sub(ledge$eventDateClean, start = -5, end = -4)
  year <- str_sub(ledge$eventDateClean, start = 1, end = 4)
  
  ledge$eventDateClean <- paste(year, month, day, sep = '') 
  
  # to make it easier, I'm going to make the wcid numbers more workable
  
  nwcid <- as.character(dv$wcid)
  wcida <- str_sub(nwcid, start = 1, end = 2)
  wcidb <- str_sub(nwcid, start = 4, end = 5)
  wcidc <- str_sub(nwcid, start = 7, end = 9)
  nwcid <- paste(wcida, wcidb, wcidc, sep = '')
  dv$wcid_new <- nwcid
  dv$wcid_new <- as.numeric(dv$wcid)
  
  
  # Now, I need to check and see which observations are duplicated (as per Philip's suggestion)
  duplicatedObs <- dv[which(duplicated(dv$wcid) | duplicated(dv$wcid, fromLast = TRUE)),]
  # 13 individuals who came back
  
  repeatOffenders <- unique(duplicatedObs$wcid)
  
  sedge_repeat <- data.frame()
  ledge_repeat <- data.frame()
  dv_repeat <- data.frame()
  
  for(i in 1:length(repeatOffenders)){
    id <- repeatOffenders[i]
    records <- duplicatedObs[duplicatedObs$wcid == id,]
    
    # sort these repeat offenders so that itis first time in then last time, then label these
    records <- records[order(records$enterClean),] 
    timesRepeated <- 2
    newIds <- paste(id, ".", "Visit", 1:timesRepeated, sep = "")
    records$wcid <- newIds
    
    # then I want to do the same laeling according to sedge and ledge, both for senders and recievers, based upon timing
    sedge_repeat <- sedge[sedge$receiver == id | sedge$sender == id,]
    ledge_repeat <- ledge[ledge$receiver == id | ledge$sender == id,]
    
    temp_sedge <- data.frame()
    temp_ledge <- data.frame()
    
    for(t in 1:2){
      record_t <- records[t,]
      id_t <- newIds[t]
      
      sedge_t <- sedge_repeat[which(sedge_repeat$eventDateClean >= record_t$enterClean & sedge_repeat$eventDateClean <= record_t$exitClean),]
      sedge_t_senders <- which(sedge_t$sender == id)
      sedge_t$sender[sedge_t_senders] <- id_t
      sedge_t_receiver <- which(sedge_t$receiver == id)
      sedge_t$receiver[sedge_t_receiver] <- id_t
      
      ledge_t <- ledge_repeat[which(ledge_repeat$eventDateClean >= record_t$enterClean & ledge_repeat$eventDateClean <= record_t$exitClean),]
      ledge_t_senders <- which(ledge_t$sender == id)
      ledge_t$sender[ledge_t_senders] <- id_t
      ledge_t_receiver <- which(ledge_t$receiver == id)
      ledge_t$receiver[ledge_t_receiver] <- id_t
      
      temp_sedge <- rbind(temp_sedge, sedge_t)
      temp_ledge <- rbind(temp_ledge, ledge_t)
    }
    
    sedge_repeat <- rbind(sedge_repeat, temp_sedge)
    ledge_repeat <- rbind(ledge_repeat, temp_ledge)
    dv_repeat <- rbind(dv_repeat, records)
    # end loop
  }
  
  
  # Now, we must go through and update the sedge list to remove those repeated offenders and supplement with sedge_repeat and ledge_repeat
  sedge_remove <- c(which(sedge$sender %in% repeatOffenders), which(sedge$receiver %in% repeatOffenders))
  ledge_remove <- c(which(ledge$sender %in% repeatOffenders), which(ledge$receiver %in% repeatOffenders))
  
  sedge <- sedge[-c(sedge_remove),]
  ledge <- ledge[-c(ledge_remove),]
  
  sedge <- rbind(sedge, sedge_repeat)
  ledge <- rbind(ledge, ledge_repeat)
  
  # Do the same with dv
  dv <- dv[order(dv$enterClean),] 
  dv_remove <- c(which(dv$wcid %in% repeatOffenders))
  dv <- dv[-c(dv_remove),]
  dv <- rbind(dv, dv_repeat)
  
  # remove sedge or ledge where sender == 0
  sedge <- sedge[sedge$sender != 0,]
  ledge <- ledge[ledge$sender != 0,]
  
  # Now, let's produce edgelists of affirmations (ledge) and corrections (sedge)
  affirm_el <- data.frame(Source = ledge$sender,
                          Target = ledge$receiver)
  
  
  correct_el <- data.frame(Source = sedge$sender,
                           Target = sedge$receiver)
  
  # Turn these into weighted edge lists
  library(plyr)
  affirm_el <- ddply(affirm_el,.(Source,Target),nrow)
  colnames(affirm_el)[3] <- "Weight"
  
  correct_el <- ddply(correct_el,.(Source,Target),nrow)
  colnames(correct_el)[3] <- "Weight"
  
  # Turn these into adjacency
  library(igraph)
  g <- graph.data.frame(affirm_el)
  affirm_adj <- get.adjacency(g,sparse=FALSE, attr="Weight")
  affirm_adj <- list(t1 = affirm_adj)
  
  g <- graph.data.frame(correct_el)
  correct_adj <- get.adjacency(g,sparse=FALSE, attr="Weight")
  correct_adj <- list(t1 = correct_adj)
  
  ### Now, I should loop through all of these and create a separate record for further instances of recidivism while
  # also accounting for their last visit to a TC
  ids <- unique(dv$wcid)
  
  # 
  new_df <- data.frame()
  for(i in 1:length(ids)){
    id <- ids[i]
    record <- dv[which(dv$wcid == id),]
    
    # Now I need to determine if this person has reciivated
    if(record$recidivist > 0){
      recid_list <- data.frame()
      if(is.na(record$recidflag1) == FALSE){
        record$recidClean1 <- as.Date(as.character(record$recidate1),format = '%m/%d/%Y')
        dayExit <- str_sub(record$recidClean1, start = -2)
        monthExit <- str_sub(record$recidClean1, start = -5, end = -4)
        yearExit <- str_sub(record$recidClean1, start = 1, end = 4)
        record$recidClean1 <- paste(yearExit, monthExit, dayExit, sep = '')
        
        first_reincarceration <- data.frame(
          wcid = record$wcid,
          enterClean = record$enterClean,
          recidDate = record$recidClean1,
          exitClean = record$exitClean,
          recidFlag = 1,
          strata = 0,
          success = record$success,
          age = record$age,
          lsi = record$lsi,
          lsiExit = record$lsiExit,
          black = record$black)
        
        recid_list <- rbind(recid_list, first_reincarceration)
      }
      
      if(is.na(record$recidflag2) == FALSE){
        record$recidClean2 <- as.Date(as.character(record$recidate2),format = '%m/%d/%Y')
        dayExit <- str_sub(record$recidClean2, start = -2)
        monthExit <- str_sub(record$recidClean2, start = -5, end = -4)
        yearExit <- str_sub(record$recidClean2, start = 1, end = 4)
        record$recidClean2 <- paste(yearExit, monthExit, dayExit, sep = '')
        
        second_reincarceration <- data.frame(
          wcid = record$wcid,
          enterClean = record$enterClean,
          recidDate = record$recidClean2,
          exitClean = record$exitClean,
          recidFlag = 1,
          strata = 1,
          success = record$success,
          age = record$age,
          lsi = record$lsi,
          lsiExit = record$lsiExit,
          black = record$black)
        
        recid_list <- rbind(recid_list, second_reincarceration)
      }
      
      if(is.na(record$recidflag3) == FALSE){
        record$recidClean3 <- as.Date(as.character(record$recidate3),format = '%m/%d/%Y')
        dayExit <- str_sub(record$recidClean3, start = -2)
        monthExit <- str_sub(record$recidClean3, start = -5, end = -4)
        yearExit <- str_sub(record$recidClean3, start = 1, end = 4)
        record$recidClean3 <- paste(yearExit, monthExit, dayExit, sep = '')
        
        third_reincarceration <- data.frame(
          wcid = record$wcid,
          enterClean = record$enterClean,
          recidDate = record$recidClean3,
          exitClean = record$exitClean,
          recidFlag = 1,
          strata = 2,
          success = record$success,
          age = record$age,
          lsi = record$lsi,
          lsiExit = record$lsiExit,
          black = record$black)
        
        recid_list <- rbind(recid_list, third_reincarceration)
      }
      
      if(is.na(record$recidflag4) == FALSE){
        record$recidClean4 <- as.Date(as.character(record$recidate4),format = '%m/%d/%Y')
        dayExit <- str_sub(record$recidClean4, start = -2)
        monthExit <- str_sub(record$recidClean4, start = -5, end = -4)
        yearExit <- str_sub(record$recidClean4, start = 1, end = 4)
        record$recidClean4 <- paste(yearExit, monthExit, dayExit, sep = '')
        
        fourth_reincarceration <- data.frame(
          wcid = record$wcid,
          enterClean = record$enterClean,
          recidDate = record$recidClean3,
          exitClean = record$exitClean,
          recidFlag = 1,
          strata = 3,
          success = record$success,
          age = record$age,
          lsi = record$lsi,
          lsiExit = record$lsiExit,
          black = record$black)
        
        recid_list <- rbind(recid_list, fourth_reincarceration)
      }
      
    } else {
      recid_list <- data.frame(
        wcid = record$wcid,
        enterClean = record$enterClean,
        recidDate = NA,
        exitClean = record$exitClean,
        recidFlag = 0,
        strata = 0,
        success = record$success,
        age = record$age,
        lsi = record$lsi,
        lsiExit = record$lsiExit,
        black = record$black)
    }
    new_df <- rbind(new_df, recid_list)
  }
  
  # Now I need to remove records for individuals who go back to the therapeutic community after recidivating 
  # loop through observations that end in .1 and .2, at which point I determine the instances of reincarceration
  # that occur following hte most recent visit
  
  mult_visits_Full <- rbind(new_df[grep("Visit1", new_df$wcid),], new_df[grep("Visit2", new_df$wcid),])
  
  mult_visits <- mult_visits_Full[mult_visits_Full$recidFlag == 1,]
  
  mult_visits_NOR <- mult_visits_Full[mult_visits_Full$recidFlag == 0,]
  # Keep second attempts
  seconds <- mult_visits_NOR[grep("Visit2", mult_visits_NOR$wcid),]
  
  ids_checked <- unique(duplicatedObs$wcid)
  adjusted <- data.frame()
  for(i in 1:length(ids_checked)){
    ind_set <- mult_visits[grep(ids_checked[i], mult_visits$wcid),]
    # now, go through this ind_set and determine which observations I should keep,
    # which are the relevant visits for recidivism, the most recent
    # which of these visits according to exit clean was closest to the recid date,
    # but there might be multiple cases of recidivism
    recid_dates <- unique(ind_set$recidDate)
    id_rows <- data.frame()
    for(r in 1:length(recid_dates)){
      temp <- ind_set[ind_set$recidDate == recid_dates[r],]
      # find the visits that ended before recidivism
      temp <- temp[which(as.numeric(temp$exitClean) <= as.numeric(recid_dates[r])),]
      # find the exit closest to the recid_date
      temp <- temp[which.min(as.numeric(recid_dates[r]) - as.numeric(temp$exitClean)),]
      id_rows <- rbind(id_rows, temp)
    }
    adjusted <- rbind(adjusted, id_rows)
  }
  
  seconds_complete <- rbind(seconds, adjusted)
  
  # now, I need to get a data frame completely devoid of those duplicate observations
  single_visits <- new_df[-c(grep("Visit", new_df$wcid)),]
  
  dv <- rbind(single_visits, seconds_complete)
  
  
  # for right censoring, recode NA's for recidDate as 20090904
  dv[is.na(dv$recidDate),]$recidDate <- "20090904"
  
  library(survival)
  dv$exitDays <- difftime(as.Date(dv$exitClean, format = "%Y%m%d"), as.Date("2001-01-05"), units = "days") # days
  dv$recidDays <- difftime(as.Date(dv$recidDate, format = "%Y%m%d"), as.Date("2001-01-05"), units = "days") # days 
  
  dv$gap <- c(dv$recidDays-dv$exitDays)
  
  dv <- with(dv, data.frame(
    exitClean = dv$exitClean,
    enterClean = dv$enterClean,
    success = success,
    wcid = wcid,
    gap = gap,
    recidFlag = recidFlag,
    strata = strata,
    age = age,
    lsiExit = lsiExit,
    lsi = lsi,
    black = black
  ))
  
  dv <- na.omit(dv)
  
  # remove those who recid, no expectation that TC should inform after first recid following
  
  dv <- dv[dv$strata <1,]
  ### get network covariates
  library(tnam)
  
  recid_list <- dv$recidFlag
  names(recid_list) <- dv$wcid
  recid_list <- list(t1 = recid_list)
  
  success_list <- dv$success
  names(success_list) <- dv$wcid
  success_list <- list(t1 = success_list)
  
  # create a reverse coded version
  
  gap_list <- as.numeric(dv$gap)
  names(gap_list) <- dv$wcid
  gap_list <- list(t1 = gap_list)
  
  age_list <- as.numeric(dv$age)
  names(age_list) <- dv$wcid
  age_list <- list(t1 = age_list)
  
  black_list <- dv$black
  names(black_list) <- dv$wcid
  black_list <- list(t1 = black_list)
  
  lsi_exit_list <- dv$lsiExit
  names(lsi_exit_list) <- dv$wcid
  lsi_exit_list <- list(t1 = lsi_exit_list)
  
  lsi_list <- dv$lsi
  names(lsi_list) <- dv$wcid
  lsi_list <- list(t1 = lsi_list)
  
  all_1s <- rep(1, times = nrow(dv))
  names(all_1s) <- dv$wcid
  all_1s <- list(t1 = all_1s)
  
  library(lubridate)
  finish <- ymd(dv$exitClean)
  start <- ymd(dv$enterClean)
  elapsed <-  difftime(finish,start,units="days")
  
  names(elapsed) <- dv$wcid
  days_list <- list(t1 = elapsed)
  
  tnam_out <- tnam(recid_list ~
                     netlag(success_list, correct_adj, pathdist = 1, coefname = "Correct_1") +
                     netlag(success_list, correct_adj, pathdist = 2, coefname = "Correct_2") +
                     weightlag(success_list, correct_adj, coefname = "WeightCorrect_1") +
                     centrality(correct_adj, type = "indegree", coefname = "Correct_InCent") +
                     centrality(correct_adj, type = "outdegree", coefname = "Correct_OutCent") +
                     centrality(correct_adj, type = "eigenvector", coefname = "Correct_EigenVectorCent") +
                     structsim(success_list, correct_adj, method = "euclidean", coefname = "Correct_StructSim_Euclidean")+
                     structsim(success_list, correct_adj, method = "jaccard", coefname = "Correct_StructSim_Jaccard")+
                     clustering(correct_adj, coefname = "Correct_Clustering_In") +
                     netlag(success_list, affirm_adj, pathdist = 1, coefname = "Affirm_1") +
                     netlag(success_list, affirm_adj, pathdist = 2, coefname = "Affirm_2") +
                     weightlag(success_list, affirm_adj, coefname = "WeightAffirm_1") +
                     centrality(affirm_adj, type = "indegree", coefname = "Affirm_InCent") +
                     centrality(affirm_adj, type = "outdegree", coefname = "Affirm_OutCent") +
                     centrality(affirm_adj, type = "eigenvector", coefname = "Affirm_EigenVectorCent") +
                     clustering(affirm_adj, coefname = "Affirm_Clustering_In") +
                     structsim(success_list, affirm_adj, method = "euclidean", coefname = "Affirm_StructSim_Euclidean")+
                     structsim(success_list, affirm_adj, method = "jaccard", coefname = "Affirm_StructSim_Jaccard")+
                     covariate(gap_list, coefname = "Gap") +
                     covariate(age_list, coefname = "Age") +
                     covariate(black_list, coefname = "Black") +
                     covariate(lsi_exit_list, coefname = "LSI_Exit") +
                     covariate(lsi_list, coefname = "LSI") +
                     covariate(days_list, coefname = "Days") +
                     netlag(all_1s, affirm_adj, pathdist = 1, coefname = "Affirm_1Paths") +
                     netlag(all_1s, affirm_adj, pathdist = 2, coefname = "Affirm_2Paths") +
                     netlag(all_1s, affirm_adj, pathdist = 3, coefname = "Affirm_3Paths") +
                     netlag(all_1s, affirm_adj, pathdist = 4, coefname = "Affirm_4Paths") + 
                     netlag(all_1s, correct_adj, pathdist = 1, coefname = "Correct_1Paths") +
                     netlag(all_1s, correct_adj, pathdist = 2, coefname = "Correct_2Paths") +
                     netlag(all_1s, correct_adj, pathdist = 3, coefname = "Correct_3Paths") +
                     netlag(all_1s, correct_adj, pathdist = 4, coefname = "Correct_4Paths") +
                     centrality(affirm_adj, type = "closeness", coefname = "Affirm_Closeness") +
                     centrality(correct_adj, type = "closeness", coefname = "Correct_Closeness"),
                   re.node = FALSE, 
                   time.linear = FALSE)
  
  dv <- tnam_out$model
  colnames(dv) <- c("recidFlag", "Correct1", "Correct2", "WeightCorrect", "CorrectIn", "CorrectOut", "CorrectEigen", "CorrectStructSim_Euclidean", "CorrectStructSim_Jaccard", "CorrectClustering",
                    "Affirm1", "Affirm2", "WeightAffirm", "AffirmIn", "AffirmOut", "AffirmEigen", "AffirmClustering", "AffirmStructSim_Euclidean", "AffirmStructSim_Jaccard", 
                    "Gap", "Age", "Black", "LSI_Exit", "LSI", "Days", 
                    "Affirm_1Paths", "Affirm_2Paths", "Affirm_3Paths", "Affirm_4Paths", 
                    "Correct_1Paths", "Correct_2Paths", "Correct_3Paths", "Correct_4Paths",
                    "AffirmCloseness", "CorrectCloseness")
  
  
  save(dv, file = "CCDW_SurvivalData.RData")
} else {
  load("CCDW_SurvivalData.RData")
}

######################
# Table 1
######################

library(survival)
dv <- dv[dv$Gap > 0,]
surv_out <- Surv(dv$Gap, dv$recidFlag)
dv$id <- paste0("ID", 1:nrow(dv))


cox1 <- coxph(surv_out ~
                I(log(Age+1))+
                #    I(log(LSI+1)) +
                #            Black +
                #            I(log(Days+1)) +
                #              AffirmClustering +
                #              Affirm1 +
                #              Affirm2 +
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)


cox2 <- coxph(surv_out ~
                I(log(Age+1))+
                #             I(log(LSI+1)) +
                Black +
                #            I(log(Days+1)) +
                #              AffirmClustering +
                #              Affirm1 +
                #              Affirm2 +
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)

cox3 <- coxph(surv_out ~
                I(log(Age+1))+
                #            I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                #              AffirmClustering +
                #              Affirm1 +
                #              Affirm2 +
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)

cox4 <- coxph(surv_out ~
                I(log(Age+1))+
                #            I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                AffirmClustering +
                #              Affirm1 +
                #              Affirm2 +
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)

cox5 <- coxph(surv_out ~
                I(log(Age+1))+
                #            I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                AffirmClustering +
                Affirm1 +
                #              Affirm2 +
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)


cox6 <- coxph(surv_out ~
                I(log(Age+1))+
                #    I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                AffirmClustering +
                Affirm1 +
                Affirm2 +
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)

cox7 <- coxph(surv_out ~
                I(log(Age+1))+
                #    I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                AffirmClustering +
                Affirm1 +
                Affirm2 +
                AffirmEigen+
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)

cox8 <- coxph(surv_out ~
                I(log(Age+1))+
                #    I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                AffirmClustering +
                WeightAffirm +
                Affirm2 +
                AffirmEigen+
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)

sink("Table1.txt")
texreg::screenreg(l = list(cox1, cox2, cox3, cox4, cox5, cox6, cox7, cox8),
                  custom.coef.names = c("Age", "Black", "Days", "Affirmations Network Clustering Coefficient", "Direct Affirmations, Successful Peers",
                                        "Indirect Affirmations, Successful Peers", "Affirmations Network Eigenvector Centrality", "Weighted Affirmations, Successful Peers"))
sink()



######################
# Figure 1 & Table 2
######################


describe <- function(variable = NULL, type = c("continuous", "binary"), plot_title = "Variable Density"){
  require(ggplot2)
  require(reshape2)
  if(type == "continuous"){
    dist <- summary(variable)
    missingness <- table(is.na(variable))
    plot_df <- melt(variable)
    summary_plot <- ggplot(data = plot_df, aes(x = value)) +
      geom_density(fill = "firebrick4", color = "black") +
      theme_bw() +
      xlab("Variable Value") +
      ylab("Density") +
      ggtitle(plot_title)
  }
  
  if(type == "binary"){
    dist <- table(variable)
    missingness <- table(is.na(variable))
    plot_df <- melt(variable)
    summary_plot <- ggplot(data = plot_df, aes(x = value)) +
      geom_histogram(fill = "firebrick4", color = "black", bins = 2) +
      theme_bw() +
      xlab("Variable Value") +
      ylab("Count") +
      ggtitle(plot_title) +
      scale_x_discrete(limits=c(0, 1))
  }
  return(list(dist = dist, missingness = missingness, summary_plot = summary_plot))
}

events <- describe(dv$recidFlag, type = "binary", plot_title = "Recidivism")
times <- describe(dv$Gap, type = "continuous", plot_title = "Time to Event")
age <- describe(log(dv$Age+1), type = "continuous", plot_title = "Log Age")
black <- describe(dv$Black, type = "binary", plot_title = "Race")
days <- describe(log(dv$Days+1), type = "continuous", plot_title = "Log Days")
clustcoef <- describe(dv$AffirmClustering, type = "continuous", plot_title = "Clustering Coefficient")
affirm1 <- describe(dv$Affirm1, type = "continuous", plot_title = "Direct Affirm.")
affirm2 <- describe(dv$Affirm2, type = "continuous", plot_title = "Indirect Affirm.")
affirmWeight <- describe(dv$WeightAffirm, type = "continuous", plot_title = "Weighted Affirm.")
eigenvector <- describe(dv$AffirmEigen, type = "continuous", plot_title = "Eigen. Centrality")

png("Figure1.png", height = 480, width = 1000)
Rmisc::multiplot(plotlist = list(events$summary_plot, times$summary_plot, age$summary_plot, black$summary_plot, days$summary_plot, clustcoef$summary_plot, 
                                 affirm1$summary_plot, affirm2$summary_plot, affirmWeight$summary_plot, eigenvector$summary_plot), cols = 5)
dev.off()

dist <- list(events = events$dist, times = times$dist, age = age$dist,
             black = black$dist, days = days$dist, clustcoef= clustcoef$dist,
             affirm1 = affirm1$dist, affirm2 = affirm2$dist, affirmWeight = affirmWeight$dist,
             eigenvector=eigenvector$dist)

sink("Table2.txt")
dist
sink()

###########################
# transition probabilities
###########################

library(riskRegression)
cox7 <- coxph(Surv(Gap, recidFlag) ~
                I(log(Age+1))+
                #    I(log(LSI+1)) +
                Black +
                I(log(Days+1)) +
                AffirmClustering +
                Affirm1 +
                Affirm2 +
                AffirmEigen+
                frailty(id, distribution="gamma", method="em", eps = 0.005),
              method = c("efron"),
              x = TRUE, 
              y = TRUE,
              data = dv)
affirmClustMean = mean(dv$AffirmClustering)
affirmClustSD = sd(dv$AffirmClustering)
# thin all values to avoid DOF problems, take first, second, and then every other after value after the second
values_to_take = sort(unique(dv$AffirmClustering))[c(1,seq(2, length(unique(dv$AffirmClustering)), 2))]
new_data_cox = data.frame(
  id = c(as.character(1:length(values_to_take))),
  Age = mean(dv$Age),
  Black = median(dv$Black),
  Days = mean(dv$Days),
  AffirmClustering = values_to_take,
  # AffirmClustering = unname(quantile(dv$AffirmClustering, c(0, 0.01, .1, .2, .3, .4, .5, .6, .7, .8, .9, 0.99, 1))),
  # AffirmClustering = c(0, affirmClustMean-1*affirmClustSD, affirmClustMean, affirmClustMean+1*affirmClustSD, affirmClustMean+2*affirmClustSD, affirmClustMean+3*affirmClustSD, affirmClustMean+4*affirmClustSD),
  Affirm1 = mean(dv$Affirm1),
  Affirm2 = mean(dv$Affirm2),
  AffirmEigen = mean(dv$AffirmEigen)
)

set.seed(12345)
basehaz = predictCox(cox7, newdata = new_data_cox, times = 365, band = TRUE, iid=FALSE, store.iid = "minimal")


# want survival, which is the probability of not committing a crime at 365 days

plot_df <- data.frame(
  X = new_data_cox$AffirmClustering,
  Y = basehaz$survival,
  Y_Min = basehaz$survival.lowerBand,
  Y_Max = basehaz$survival.upperBand
)

library(ggplot2)
p = ggplot(data = plot_df, aes(x = X, y = Y)) +
  geom_line(col = "firebrick4", size = 1.5) +
  geom_ribbon(aes(ymin=Y_Min, ymax=Y_Max), col = "firebrick4", fill = "firebrick4", alpha = 0.25)  + 
  theme_bw() + 
  theme(legend.position = c(0.2, 0.8), 
        axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold")) +
  # ggtitle("4C: Clustering Middle-High Strata")+
  ylab('Probability of Not Committing Another Crime') +
  xlab('Clustering Coefficient in Affirmation Network')


png("Figure2.png")
p
dev.off()

interp_cox = data.frame(
  id = c(as.character(1:5)),
  Age = mean(dv$Age),
  Black = median(dv$Black),
  Days = mean(dv$Days),
  # AffirmClustering = values_to_take,
  # AffirmClustering = unname(quantile(dv$AffirmClustering, c(0, 0.01, .1, .2, .3, .4, .5, .6, .7, .8, .9, 0.99, 1))),
  AffirmClustering = c(0, affirmClustMean-1*affirmClustSD, affirmClustMean, affirmClustMean+1*affirmClustSD, affirmClustMean+2*affirmClustSD),
  Affirm1 = mean(dv$Affirm1),
  Affirm2 = mean(dv$Affirm2),
  AffirmEigen = mean(dv$AffirmEigen)
)

set.seed(12345)
basehaz_interp = predictCox(cox7, newdata = interp_cox, times = 365, band = TRUE, iid=FALSE, store.iid = "minimal")

sink("InterpProbs.txt")
plot_df <- data.frame(
  X = c("0 value", "1 SD below mean", "mean", "1 SD above mean", "2 SD above mean"),
  Y = basehaz_interp$survival,
  Y_Min = basehaz_interp$survival.lowerBand,
  Y_Max = basehaz_interp$survival.upperBand
)
plot_df
sink()
